#!/bin/bash
# Trace every command as it is executed. This is enabled with `set` rather
# than as a shebang option so that tracing is also active when the script is
# started as `bash <script>` (interpreter options on the shebang line are
# ignored in that case).
set -x

# files expected: 
# /home/sandesh247/year_count_weights
# /home/sandesh247/known_lifespans
# 

# Experiment settings; together they form the tag embedded in output names.
YEAR_COUNTS_WEIGHTED=false
YEAR_COUNTS_TRIM=0.0

# Job jar and the comma-separated list of jars it depends on.
JAR=build/jar/sandesh247-lab-java.jar
LIBJARS=lib/cloud9-1.2.3.jar,lib/bliki-core-3.0.16.jar,lib/commons-lang-2.6.jar

# HDFS locations: scratch directory for outputs and the XML input file.
HDFS_ROOT=/tmp_active_dates_scratch
WIKI_XML_HDFS_PATH=/active_dates/enwiki-20110901-pages-articles.xml

# "<trim>_<weighted>" tag, then one HDFS output directory per job.
EVALUATION_PREFIX=${YEAR_COUNTS_TRIM}_${YEAR_COUNTS_WEIGHTED}
YEAR_COUNT_HDFS_PATH=$HDFS_ROOT/YearCounts_$EVALUATION_PREFIX
BIRTHDATES_HDFS_PATH=$HDFS_ROOT/birthdates
EVALUATION_HDFS_PATH=$HDFS_ROOT/AllEvaluation_$EVALUATION_PREFIX
GENDER_HDFS_PATH=$HDFS_ROOT/gender


#######################################
# Wipes the HDFS scratch directory and runs the four Hadoop jobs:
# YearCountsMR, BirthdateMR, AllIntervalEvaluator and GenderEvaluatorMR.
# Globals (read):
#   JAR, LIBJARS, HDFS_ROOT, WIKI_XML_HDFS_PATH, YEAR_COUNT_HDFS_PATH,
#   BIRTHDATES_HDFS_PATH, EVALUATION_HDFS_PATH, EVALUATION_PREFIX,
#   GENDER_HDFS_PATH
# Outputs:
#   `time` statistics for each job and a skip notice on stderr.
# Returns:
#   0 if every job succeeded, 1 if any job failed or was skipped.
#######################################
function run_mrs() {
	local status=0
	local year_counts_ok=1

	# Start from an empty scratch directory. The exit status is deliberately
	# ignored: the removal may fail simply because the directory does not
	# exist yet. `:?` aborts instead of issuing the delete with an empty path.
	hadoop dfs -rmr "${HDFS_ROOT:?HDFS_ROOT must be set}"

	# NOTE(review): the trailing "0.0 false false" is hard-coded. The first
	# two literals match the current values of YEAR_COUNTS_TRIM and
	# YEAR_COUNTS_WEIGHTED, which otherwise only feed output path names;
	# presumably they should be passed here -- confirm against YearCountsMR's
	# argument order before changing.
	time hadoop jar "${JAR}" snippets.cse524.activeDates.YearCountsMR \
		-libjars "${LIBJARS}" \
		"${WIKI_XML_HDFS_PATH}" \
		"${YEAR_COUNT_HDFS_PATH}" \
		0.0 false false || year_counts_ok=0

	time hadoop jar "${JAR}" snippets.cse524.activeDates.BirthdateMR \
		-libjars "${LIBJARS}" \
		"${WIKI_XML_HDFS_PATH}" \
		"${BIRTHDATES_HDFS_PATH}" || status=1

	# The evaluator is pointed at YEAR_COUNT_HDFS_PATH, the location handed to
	# YearCountsMR above, so running it after that job failed is pointless.
	if (( year_counts_ok )); then
		time hadoop jar "${JAR}" snippets.cse524.activeDates.AllIntervalEvaluator \
			-libjars "${LIBJARS}" \
			"${YEAR_COUNT_HDFS_PATH}" \
			"${EVALUATION_HDFS_PATH}" "${EVALUATION_PREFIX}" || status=1
	else
		echo "YearCountsMR failed; skipping AllIntervalEvaluator" >&2
		status=1
	fi

	time hadoop jar "${JAR}" snippets.cse524.activeDates.GenderEvaluatorMR \
		-libjars "${LIBJARS}" \
		"${WIKI_XML_HDFS_PATH}" \
		"${GENDER_HDFS_PATH}" || status=1

	return "${status}"
}

#######################################
# Concatenates the part files of the gender, birthdates and evaluation job
# outputs from HDFS into local files in the current directory. Each local
# file is named after the last component of its HDFS directory.
# Globals (read):
#   GENDER_HDFS_PATH, BIRTHDATES_HDFS_PATH, EVALUATION_HDFS_PATH
# Outputs:
#   "now processing" on stdout; one local file per HDFS directory.
# Returns:
#   0 if every download succeeded, 1 otherwise.
#######################################
function extract_mr_output() {
	local hdfs_dir
	local status=0

	echo "now processing"
	for hdfs_dir in \
		"${GENDER_HDFS_PATH}" \
		"${BIRTHDATES_HDFS_PATH}" \
		"${EVALUATION_HDFS_PATH}"; do
		# The glob is quoted on purpose: it must reach hadoop verbatim so it is
		# matched against HDFS, not expanded by the shell against local files.
		hadoop dfs -cat "${hdfs_dir}/part*" \
			> "$(basename -- "${hdfs_dir}")" || status=1
	done

	return "${status}"
}

#######################################
# Reduces the locally extracted evaluation file to tab-separated fields
# 1, 9 and 10 and writes the result next to it with a ".best" suffix.
# Globals (read):
#   EVALUATION_HDFS_PATH (only its last path component is used)
# Returns:
#   Exit status of basename if it fails, otherwise that of cut.
#######################################
function process_mr_output() {
	local eval_file
	eval_file="$(basename -- "${EVALUATION_HDFS_PATH}")" || return

	# Fields 9 and 10 are the ones determined empirically to be the best.
	cut -f1,9,10 -- "${eval_file}" > "${eval_file}.best"
}

# Pipeline driver: run the Hadoop jobs, pull their output down from HDFS,
# then post-process it locally. Each stage runs regardless of how the
# previous one exited.
for stage in run_mrs extract_mr_output process_mr_output; do
	"${stage}"
done
